In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the vehicle dataset; the path is relative to the notebook's working directory.
csv_path = 'vehicle.csv'
data = pd.read_csv(csv_path)
In [3]:
# Preview the first five rows to check that the columns parsed as expected.
data.head(n=5)
Out[3]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
In [4]:
# Summary statistics, transposed so that each feature occupies one row.
summary_stats = data.describe()
summary_stats.transpose()
Out[4]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.0 119.0
circularity 841.0 44.828775 6.152172 33.0 40.00 44.0 49.0 59.0
distance_circularity 842.0 82.110451 15.778292 40.0 70.00 80.0 98.0 112.0
radius_ratio 840.0 168.888095 33.520198 104.0 141.00 167.0 195.0 333.0
pr.axis_aspect_ratio 844.0 61.678910 7.891463 47.0 57.00 61.0 65.0 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.0 55.0
scatter_ratio 845.0 168.901775 33.214848 112.0 147.00 157.0 198.0 265.0
elongatedness 845.0 40.933728 7.816186 26.0 33.00 43.0 46.0 61.0
pr.axis_rectangularity 843.0 20.582444 2.592933 17.0 19.00 20.0 23.0 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.0 188.0
scaled_variance 843.0 188.631079 31.411004 130.0 167.00 179.0 217.0 320.0
scaled_variance.1 844.0 439.494076 176.666903 184.0 318.00 363.5 587.0 1018.0
scaled_radius_of_gyration 844.0 174.709716 32.584808 109.0 149.00 173.5 198.0 268.0
scaled_radius_of_gyration.1 842.0 72.447743 7.486190 59.0 67.00 71.5 75.0 135.0
skewness_about 840.0 6.364286 4.920649 0.0 2.00 6.0 9.0 22.0
skewness_about.1 845.0 12.602367 8.936081 0.0 5.00 11.0 19.0 41.0
skewness_about.2 845.0 188.919527 6.155809 176.0 184.00 188.0 193.0 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.0 211.0
In [5]:
# Pairwise correlations between the numeric features. The string-valued 'class'
# column is excluded explicitly: recent pandas versions no longer drop
# non-numeric columns silently and raise when asked to correlate them.
data.select_dtypes(include='number').corr()
Out[5]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
compactness 1.000000 0.689786 0.791707 0.691081 0.091779 0.148249 0.812770 -0.788736 0.814248 0.676143 0.764361 0.818674 0.585845 -0.250603 0.236685 0.157670 0.298528 0.365552
circularity 0.689786 1.000000 0.797180 0.625051 0.154283 0.251407 0.858265 -0.827246 0.856603 0.965729 0.806791 0.850863 0.935950 0.053080 0.144968 -0.011869 -0.106339 0.045652
distance_circularity 0.791707 0.797180 1.000000 0.771748 0.158684 0.264621 0.907949 -0.913020 0.896273 0.775149 0.865710 0.890541 0.706950 -0.227001 0.114665 0.266049 0.146027 0.333648
radius_ratio 0.691081 0.625051 0.771748 1.000000 0.665363 0.450486 0.738480 -0.792946 0.712744 0.571083 0.798294 0.725598 0.541325 -0.181520 0.049112 0.174469 0.382912 0.472339
pr.axis_aspect_ratio 0.091779 0.154283 0.158684 0.665363 1.000000 0.648861 0.103832 -0.183492 0.079566 0.127322 0.273738 0.089750 0.122454 0.152860 -0.058539 -0.032180 0.240201 0.267760
max.length_aspect_ratio 0.148249 0.251407 0.264621 0.450486 0.648861 1.000000 0.165998 -0.180053 0.161603 0.305943 0.319033 0.143745 0.189752 0.295638 0.015446 0.043491 -0.026184 0.143919
scatter_ratio 0.812770 0.858265 0.907949 0.738480 0.103832 0.165998 1.000000 -0.973504 0.992078 0.810017 0.951672 0.996328 0.800577 -0.028006 0.074376 0.213512 0.005171 0.118504
elongatedness -0.788736 -0.827246 -0.913020 -0.792946 -0.183492 -0.180053 -0.973504 1.000000 -0.950405 -0.776150 -0.938313 -0.956488 -0.766671 0.103535 -0.052243 -0.186027 -0.114846 -0.216769
pr.axis_rectangularity 0.814248 0.856603 0.896273 0.712744 0.079566 0.161603 0.992078 -0.950405 1.000000 0.813135 0.938182 0.992316 0.798522 -0.015711 0.083219 0.215200 -0.019066 0.099481
max.length_rectangularity 0.676143 0.965729 0.775149 0.571083 0.127322 0.305943 0.810017 -0.776150 0.813135 1.000000 0.746657 0.797485 0.866554 0.041283 0.136077 0.001660 -0.104437 0.076770
scaled_variance 0.764361 0.806791 0.865710 0.798294 0.273738 0.319033 0.951672 -0.938313 0.938182 0.746657 1.000000 0.949766 0.781016 0.112452 0.036165 0.196202 0.014434 0.086708
scaled_variance.1 0.818674 0.850863 0.890541 0.725598 0.089750 0.143745 0.996328 -0.956488 0.992316 0.797485 0.949766 1.000000 0.797318 -0.016642 0.077288 0.202398 0.006648 0.103839
scaled_radius_of_gyration 0.585845 0.935950 0.706950 0.541325 0.122454 0.189752 0.800577 -0.766671 0.798522 0.866554 0.781016 0.797318 1.000000 0.192245 0.166785 -0.056067 -0.225882 -0.118597
scaled_radius_of_gyration.1 -0.250603 0.053080 -0.227001 -0.181520 0.152860 0.295638 -0.028006 0.103535 -0.015711 0.041283 0.112452 -0.016642 0.192245 1.000000 -0.088736 -0.126686 -0.752437 -0.804793
skewness_about 0.236685 0.144968 0.114665 0.049112 -0.058539 0.015446 0.074376 -0.052243 0.083219 0.136077 0.036165 0.077288 0.166785 -0.088736 1.000000 -0.035154 0.115728 0.097293
skewness_about.1 0.157670 -0.011869 0.266049 0.174469 -0.032180 0.043491 0.213512 -0.186027 0.215200 0.001660 0.196202 0.202398 -0.056067 -0.126686 -0.035154 1.000000 0.077460 0.205115
skewness_about.2 0.298528 -0.106339 0.146027 0.382912 0.240201 -0.026184 0.005171 -0.114846 -0.019066 -0.104437 0.014434 0.006648 -0.225882 -0.752437 0.115728 0.077460 1.000000 0.893869
hollows_ratio 0.365552 0.045652 0.333648 0.472339 0.267760 0.143919 0.118504 -0.216769 0.099481 0.076770 0.086708 0.103839 -0.118597 -0.804793 0.097293 0.205115 0.893869 1.000000
In [6]:
# Scatter-plot matrix of the features with KDE curves on the diagonal.
# NOTE(review): this runs before missing values are imputed, which is presumably
# what produces the "invalid value encountered" RuntimeWarnings from the KDE code.
sns.pairplot(data=data, diag_kind='kde')
C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:447: RuntimeWarning: invalid value encountered in greater
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.
C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:447: RuntimeWarning: invalid value encountered in less
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.
Out[6]:
<seaborn.axisgrid.PairGrid at 0x1fba8ba2400>
In [7]:
# Number of missing values in each column.
missing_mask = data.isnull()
missing_mask.sum()
Out[7]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [8]:
# Collect the rows that contain at least one missing value and preview them.
row_has_missing = data.isna().any(axis=1)
null = data.loc[row_has_missing]
null.head()
Out[8]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
5 107 NaN 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 bus
9 93 44.0 98.0 NaN 62.0 11 183.0 36.0 22.0 146 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204 car
19 101 56.0 100.0 215.0 NaN 10 208.0 32.0 24.0 169 227.0 651.0 223.0 74.0 6.0 5.0 186.0 193 car
35 100 46.0 NaN 172.0 67.0 9 157.0 43.0 20.0 150 170.0 363.0 184.0 67.0 17.0 7.0 192.0 200 van
66 81 43.0 68.0 125.0 57.0 8 149.0 46.0 19.0 146 169.0 323.0 172.0 NaN NaN 18.0 179.0 184 bus
In [9]:
# Impute every missing numeric value with the mean of its own column, in one pass.
# The previous per-column `data[col].replace(..., inplace=True)` calls mutated a
# temporary column selection; under pandas copy-on-write that mutation is no
# longer propagated back to `data`, so the NaNs would silently survive.
column_means = data.mean(numeric_only=True)
# fillna with a Series fills each column from the entry that shares its name;
# columns without a mean (the string 'class' column) are left untouched.
data = data.fillna(column_means)
In [10]:
# Inspect dtypes and per-column non-null counts to confirm the imputation left no gaps.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    846 non-null float64
distance_circularity           846 non-null float64
radius_ratio                   846 non-null float64
pr.axis_aspect_ratio           846 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  846 non-null float64
elongatedness                  846 non-null float64
pr.axis_rectangularity         846 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                846 non-null float64
scaled_variance.1              846 non-null float64
scaled_radius_of_gyration      846 non-null float64
scaled_radius_of_gyration.1    846 non-null float64
skewness_about                 846 non-null float64
skewness_about.1               846 non-null float64
skewness_about.2               846 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
In [11]:
# Box plots of all numeric columns on one wide figure; points drawn beyond the
# whiskers show that several variables contain outliers.
wide_figsize = (40, 20)
data.boxplot(figsize=wide_figsize)
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fbb4993be0>
In [ ]:
 

There are outliers in the following features:

radius_ratio
pr.axis_aspect_ratio
max.length_aspect_ratio
scaled_variance
scaled_variance.1
scaled_radius_of_gyration.1
skewness_about
skewness_about.1

In [12]:
# Largest observed value of 'skewness_about', one of the outlier-prone features.
data.loc[:, 'skewness_about'].max()
Out[12]:
22.0
In [13]:
# Cross-tabulate each outlier-prone feature against the vehicle class.
# Every table is kept in a dict so it can be inspected afterwards; previously
# only the last call's result was displayed and the first four tables were
# computed and thrown away.
outlier_features = [
    'max.length_aspect_ratio',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'scaled_variance',
    'scaled_variance.1',
    # Not tabulated: 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1'
]
class_crosstabs = {
    feature: pd.crosstab(data[feature], data['class'])
    for feature in outlier_features
}
# Display the same table the cell showed before (the last feature in the list).
class_crosstabs['scaled_variance.1']
Out[13]:
class bus car van
scaled_variance.1
184.0 0 1 0
191.0 0 1 0
192.0 0 1 0
193.0 0 1 0
194.0 0 0 1
195.0 0 0 1
196.0 0 1 1
197.0 0 1 0
200.0 0 0 1
203.0 0 0 2
204.0 0 0 2
205.0 0 0 1
206.0 0 0 1
207.0 0 0 2
208.0 0 0 2
209.0 0 1 2
211.0 0 0 1
212.0 0 0 2
213.0 0 0 1
216.0 0 1 0
218.0 0 0 1
219.0 0 1 1
220.0 0 1 0
221.0 0 1 1
222.0 0 0 1
223.0 0 0 2
224.0 0 1 0
225.0 0 0 2
227.0 0 1 1
229.0 0 2 0
... ... ... ...
741.0 0 1 0
748.0 0 2 0
752.0 0 1 0
756.0 0 1 0
757.0 0 1 0
766.0 1 0 0
776.0 1 0 0
816.0 1 0 0
822.0 1 0 0
833.0 1 0 0
838.0 1 0 0
844.0 1 0 0
855.0 1 0 0
857.0 1 0 0
866.0 1 0 0
870.0 1 0 0
891.0 1 0 0
892.0 1 0 0
904.0 1 0 0
923.0 1 0 0
928.0 2 0 0
954.0 1 0 0
956.0 1 0 0
957.0 1 0 0
966.0 1 0 0
968.0 1 0 0
982.0 1 0 0
987.0 1 0 0
998.0 1 0 0
1018.0 1 0 0

423 rows × 3 columns

In [14]:
# Interquartile range of each numeric feature, used by the outlier filter below.
# Quantiles are taken over numeric columns only: the string 'class' column has
# no quantile, and recent pandas raises on it instead of skipping it.
numeric_features = data.select_dtypes(include='number')
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1
IQR
Out[14]:
compactness                     13.00
circularity                      9.00
distance_circularity            28.00
radius_ratio                    54.00
pr.axis_aspect_ratio             8.00
max.length_aspect_ratio          3.00
scatter_ratio                   51.00
elongatedness                   13.00
pr.axis_rectangularity           4.00
max.length_rectangularity       22.00
scaled_variance                 50.00
scaled_variance.1              268.50
scaled_radius_of_gyration       49.00
scaled_radius_of_gyration.1      8.00
skewness_about                   7.00
skewness_about.1                14.00
skewness_about.2                 9.00
hollows_ratio                   10.75
dtype: float64
In [15]:
# Drop every row with at least one value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Only the columns that have an IQR are compared, so the non-numeric 'class'
# column never takes part in the comparison. .copy() makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
fence_columns = IQR.index
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
measurements = data[fence_columns]
is_outlier_row = ((measurements < lower_fence) | (measurements > upper_fence)).any(axis=1)
data = data[~is_outlier_row].copy()
In [16]:
# Redraw the box plots to check the effect of the outlier filter.
wide_figsize = (40, 20)
data.boxplot(figsize=wide_figsize)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fbb87876d8>
In [17]:
#Outliers have been removed.
In [45]:
from sklearn.preprocessing import LabelEncoder

# Encode the class names as integers (the preview shows bus -> 0, car -> 1, van -> 2).
# DataFrame.assign builds a new frame instead of writing into `data`, which at
# this point is a filtered selection of the loaded frame; the direct column
# assignment triggered SettingWithCopyWarning.
labels = LabelEncoder()
data = data.assign(**{'class': labels.fit_transform(data['class'])})
data.head()
C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
Out[45]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.000000 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 2
1 91 41.000000 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 2
2 104 50.000000 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 1
3 93 41.000000 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 2
5 107 44.828775 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 0
In [19]:
# Scatter-plot matrix of all features, coloured by the encoded class.
# NOTE(review): rendering the full grid is slow; the recorded run ended in a
# KeyboardInterrupt while the figure was being drawn.
sns.pairplot(data=data, hue='class', diag_kind='kde')
C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:487: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[19]:
<seaborn.axisgrid.PairGrid at 0x1fbb890f9e8>
Error in callback <function flush_figures at 0x000001FBA77F02F0> (for post_execute):
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
~\Anaconda3\lib\site-packages\ipykernel\pylab\backend_inline.py in flush_figures()
    115         # ignore the tracking, just draw and close all figures
    116         try:
--> 117             return show(True)
    118         except Exception as e:
    119             # safely show traceback if in IPython, else raise

~\Anaconda3\lib\site-packages\ipykernel\pylab\backend_inline.py in show(close, block)
     37             display(
     38                 figure_manager.canvas.figure,
---> 39                 metadata=_fetch_figure_metadata(figure_manager.canvas.figure)
     40             )
     41     finally:

~\Anaconda3\lib\site-packages\IPython\core\display.py in display(include, exclude, metadata, transient, display_id, *objs, **kwargs)
    304             publish_display_data(data=obj, metadata=metadata, **kwargs)
    305         else:
--> 306             format_dict, md_dict = format(obj, include=include, exclude=exclude)
    307             if not format_dict:
    308                 # nothing to display (e.g. _ipython_display_ took over)

~\Anaconda3\lib\site-packages\IPython\core\formatters.py in format(self, obj, include, exclude)
    178             md = None
    179             try:
--> 180                 data = formatter(obj)
    181             except:
    182                 # FIXME: log the exception

<C:\Users\Praveen Kshma Mitra\Anaconda3\lib\site-packages\decorator.py:decorator-gen-9> in __call__(self, obj)

~\Anaconda3\lib\site-packages\IPython\core\formatters.py in catch_format_error(method, self, *args, **kwargs)
    222     """show traceback on failed format call"""
    223     try:
--> 224         r = method(self, *args, **kwargs)
    225     except NotImplementedError:
    226         # don't warn on NotImplementedErrors

~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
    339                 pass
    340             else:
--> 341                 return printer(obj)
    342             # Finally look for special method names
    343             method = get_real_method(obj, self.print_method)

~\Anaconda3\lib\site-packages\IPython\core\pylabtools.py in <lambda>(fig)
    242 
    243     if 'png' in formats:
--> 244         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    245     if 'retina' in formats or 'png2x' in formats:
    246         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))

~\Anaconda3\lib\site-packages\IPython\core\pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs)
    126 
    127     bytes_io = BytesIO()
--> 128     fig.canvas.print_figure(bytes_io, **kw)
    129     data = bytes_io.getvalue()
    130     if fmt == 'svg':

~\Anaconda3\lib\site-packages\matplotlib\backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, **kwargs)
   2058                     bbox_artists = kwargs.pop("bbox_extra_artists", None)
   2059                     bbox_inches = self.figure.get_tightbbox(renderer,
-> 2060                             bbox_extra_artists=bbox_artists)
   2061                     pad = kwargs.pop("pad_inches", None)
   2062                     if pad is None:

~\Anaconda3\lib\site-packages\matplotlib\figure.py in get_tightbbox(self, renderer, bbox_extra_artists)
   2364 
   2365         for a in artists:
-> 2366             bbox = a.get_tightbbox(renderer)
   2367             if bbox is not None and (bbox.width != 0 or bbox.height != 0):
   2368                 bb.append(bbox)

~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in get_tightbbox(self, renderer, call_axes_locator, bbox_extra_artists)
   4354                 bb.append(bb_yaxis)
   4355 
-> 4356         self._update_title_position(renderer)
   4357         bb.append(self.get_window_extent(renderer))
   4358 

~\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _update_title_position(self, renderer)
   2545                     if (ax.xaxis.get_label_position() == 'top' or
   2546                             ax.xaxis.get_ticks_position() in choices):
-> 2547                         bb = ax.xaxis.get_tightbbox(renderer)
   2548                     else:
   2549                         bb = ax.get_window_extent(renderer)

~\Anaconda3\lib\site-packages\matplotlib\axis.py in get_tightbbox(self, renderer)
   1161             return
   1162 
-> 1163         ticks_to_draw = self._update_ticks()
   1164 
   1165         self._update_label_position(renderer)

~\Anaconda3\lib\site-packages\matplotlib\axis.py in _update_ticks(self)
   1086             tick.set_label1(label)
   1087             tick.set_label2(label)
-> 1088         minor_locs = self.get_minorticklocs()
   1089         minor_labels = self.minor.formatter.format_ticks(minor_locs)
   1090         minor_ticks = self.get_minor_ticks(len(minor_locs))

~\Anaconda3\lib\site-packages\matplotlib\axis.py in get_minorticklocs(self)
   1328         """Get the array of minor tick locations in data coordinates."""
   1329         # Remove minor ticks duplicating major ticks.
-> 1330         major_locs = self.major.locator()
   1331         minor_locs = self.minor.locator()
   1332         transform = self._scale.get_transform()

~\Anaconda3\lib\site-packages\matplotlib\ticker.py in __call__(self)
   2076     def __call__(self):
   2077         vmin, vmax = self.axis.get_view_interval()
-> 2078         return self.tick_values(vmin, vmax)
   2079 
   2080     def tick_values(self, vmin, vmax):

~\Anaconda3\lib\site-packages\matplotlib\ticker.py in tick_values(self, vmin, vmax)
   2084         vmin, vmax = mtransforms.nonsingular(
   2085             vmin, vmax, expander=1e-13, tiny=1e-14)
-> 2086         locs = self._raw_ticks(vmin, vmax)
   2087 
   2088         prune = self._prune

~\Anaconda3\lib\site-packages\matplotlib\ticker.py in _raw_ticks(self, vmin, vmax)
   2023         if self._nbins == 'auto':
   2024             if self.axis is not None:
-> 2025                 nbins = np.clip(self.axis.get_tick_space(),
   2026                                 max(1, self._min_n_ticks - 1), 9)
   2027             else:

~\Anaconda3\lib\site-packages\matplotlib\axis.py in get_tick_space(self)
   2173         ends = self.axes.transAxes.transform([[0, 0], [1, 0]])
   2174         length = ((ends[1][0] - ends[0][0]) / self.axes.figure.dpi) * 72
-> 2175         tick = self._get_tick(True)
   2176         # There is a heuristic here that the aspect ratio of tick text
   2177         # is no more than 3:1

~\Anaconda3\lib\site-packages\matplotlib\axis.py in _get_tick(self, major)
   1904         else:
   1905             tick_kw = self._minor_tick_kw
-> 1906         return XTick(self.axes, 0, '', major=major, **tick_kw)
   1907 
   1908     def _get_label(self):

~\Anaconda3\lib\site-packages\matplotlib\axis.py in __init__(self, axes, loc, label, size, width, color, tickdir, pad, labelsize, labelcolor, zorder, gridOn, tick1On, tick2On, label1On, label2On, major, labelrotation, grid_color, grid_linestyle, grid_linewidth, grid_alpha, **kw)
    156         self.apply_tickdir(tickdir)
    157 
--> 158         self.tick1line = self._get_tick1line()
    159         self.tick2line = self._get_tick2line()
    160         self.gridline = self._get_gridline()

~\Anaconda3\lib\site-packages\matplotlib\axis.py in _get_tick1line(self)
    462                           linestyle='None', marker=self._tickmarkers[0],
    463                           markersize=self._size,
--> 464                           markeredgewidth=self._width, zorder=self._zorder)
    465         l.set_transform(self.axes.get_xaxis_transform(which='tick1'))
    466         self._set_artist_props(l)

~\Anaconda3\lib\site-packages\matplotlib\lines.py in __init__(self, xdata, ydata, linewidth, linestyle, color, marker, markersize, markeredgewidth, markeredgecolor, markerfacecolor, markerfacecoloralt, fillstyle, antialiased, dash_capstyle, solid_capstyle, dash_joinstyle, solid_joinstyle, pickradius, drawstyle, markevery, **kwargs)
    380         self._color = None
    381         self.set_color(color)
--> 382         self._marker = MarkerStyle(marker, fillstyle)
    383 
    384         self._markevery = None

~\Anaconda3\lib\site-packages\matplotlib\markers.py in __init__(self, marker, fillstyle)
    241         self._marker_function = None
    242         self.set_fillstyle(fillstyle)
--> 243         self.set_marker(marker)
    244 
    245     def _recache(self):

~\Anaconda3\lib\site-packages\matplotlib\markers.py in set_marker(self, marker)
    314 
    315         self._marker = marker
--> 316         self._recache()
    317 
    318     def get_path(self):

~\Anaconda3\lib\site-packages\matplotlib\markers.py in _recache(self)
    254         self._capstyle = 'butt'
    255         self._filled = True
--> 256         self._marker_function()
    257 
    258     def __bool__(self):

~\Anaconda3\lib\site-packages\matplotlib\markers.py in _set_tickdown(self)
    767 
    768     def _set_tickdown(self):
--> 769         self._transform = Affine2D().scale(1.0, -1.0)
    770         self._snap_threshold = 1.0
    771         self._filled = False

~\Anaconda3\lib\site-packages\matplotlib\transforms.py in scale(self, sx, sy)
   1989         scale_mtx = np.array(
   1990             [[sx, 0.0, 0.0], [0.0, sy, 0.0], [0.0, 0.0, 1.0]], float)
-> 1991         self._mtx = np.dot(scale_mtx, self._mtx)
   1992         self.invalidate()
   1993         return self

KeyboardInterrupt: 
In [20]:
# Annotated heat map of the full correlation matrix (features plus encoded class).
fig, ax = plt.subplots(figsize=(20, 20))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fbb89190b8>
In [21]:
# Report the features whose correlation with 'class' lies strictly inside (-0.1, 0.1).
Correlation = data.corr()
weak = Correlation.abs() < 0.1  # NaN correlations compare as False, as before

# Each unordered pair is considered once (row position < column position).
has_weak_pair = np.triu(weak.values, k=1).any()
if not has_weak_pair:
    indices = []
    print ("some corrrelation between them.")
else:
    # Look the class column up by name instead of the hard-coded position 18,
    # so the check keeps working if columns are added, removed or reordered.
    weak_with_class = weak['class'].drop('class')
    indices = list(weak_with_class[weak_with_class].index)
    if len(indices) == 0:
        print ('corrrelation with class column.')
    else:
        print ("No good correlation between class and other variables: ", indices)
No good correlation between class and other variables:  ['compactness', 'distance_circularity', 'max.length_rectangularity', 'skewness_about.1', 'skewness_about.2']
In [22]:
#Scaler
from scipy.stats import zscore

# Standardise the feature columns only. The class label is carried over
# unchanged: z-scoring it would turn the integer class codes into continuous
# floats, which scikit-learn classifiers reject as a target.
feature_frame = data.drop('class', axis=1)
data_new = feature_frame.apply(zscore)
data_new['class'] = data['class']

PCA

In [29]:
from sklearn import model_selection

# Columns left out of the feature matrix: the target itself plus six features
# excluded from modelling.
columns = ['class','compactness', 'distance_circularity', 'pr.axis_aspect_ratio', 'max.length_rectangularity', 'skewness_about.1', 'skewness_about.2']
X = data_new.drop(columns, axis=1)
# Take the target from `data`, whose 'class' column holds the discrete integer
# codes, rather than from the z-scored frame: a classifier needs discrete
# labels, not standardised floats. Both frames share the same row index.
y = data['class']

test_size = 0.30  # 70:30 split between training and test set
seed = 10  # random seed, for repeatability of the split

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size, random_state=seed)
In [30]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, to inspect how the variance is
# distributed across components before choosing how many to keep.
pca = PCA()
pca.fit(X_train)
Variance, singular_values = pca.explained_variance_ratio_, pca.singular_values_
print(Variance)
singular_values
[6.14936180e-01 1.87126431e-01 8.47581754e-02 5.67008867e-02
 2.69610772e-02 1.25179021e-02 6.39166956e-03 4.69058860e-03
 3.28475302e-03 1.87347828e-03 7.12291135e-04 4.65667556e-05]
Out[30]:
array([65.11637661, 35.92053286, 24.17497329, 19.77288901, 13.63464465,
        9.29054044,  6.63868956,  5.68707395,  4.75911899,  3.59417761,
        2.2161746 ,  0.56664778])
In [36]:
# Express each component's share of the variance as a percentage, largest
# first, and accumulate the shares.
total = sum(Variance)
descending_shares = sorted(Variance, reverse=True)
var_exp = [share / total * 100 for share in descending_shares]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
Cumulative Variance Explained [ 61.49361804  80.20626113  88.68207867  94.35216734  97.04827506
  98.30006527  98.93923222  99.40829108  99.73676638  99.92411421
  99.99534332 100.        ]
In [37]:
# Individual (bars) and cumulative (step line) explained variance per component.
component_numbers = range(1, Variance.size + 1)
variance_fig, variance_ax = plt.subplots(figsize=(10, 5))
variance_ax.bar(component_numbers, var_exp, alpha=0.5, align='center', label='Individual explained variance')
variance_ax.step(component_numbers, cum_var_exp, where='mid', label='Cumulative explained variance')
variance_ax.set_ylabel('Explained Variance Ratio')
variance_ax.set_xlabel('Principal Components')
variance_ax.legend(loc='best')
variance_fig.tight_layout()
plt.show()
In [40]:
# Elbow plot: explained variance (in percent) against component number.
fig = plt.figure(figsize=(10, 10))
elbow_ax = fig.gca()
elbow_ax.plot(range(1, Variance.size + 1), var_exp, 'bo-', linewidth=2)
elbow_ax.set_title('Elbow Plot')
elbow_ax.set_xlabel('Principal Component')
elbow_ax.set_ylabel('Explained Varience Ratio')
plt.show()
In [41]:
# Refit the PCA with a fractional n_components (0.98), which asks for a
# variance target rather than a fixed number of components.
explained_variance_target = .98
pca = PCA(n_components=explained_variance_target)
pca.fit(X_train)
Out[41]:
PCA(copy=True, iterated_power='auto', n_components=0.98, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [42]:
# Project both splits onto the components learned from the training data.
X_train_new, X_test_new = (pca.transform(split) for split in (X_train, X_test))
In [43]:
# Attach the target to the projected training data.
# pca.transform returns a plain array, so the frame is given y_train's index
# explicitly. With the default 0..n-1 index, join() would align rows against
# y_train's shuffled original row labels and pair the components with the wrong
# (or missing) class values.
Proj_data_Train = pd.DataFrame(X_train_new, index=y_train.index)
Proj_data_Train = Proj_data_Train.join(y_train)
In [44]:
# Correlation heat map of the principal components together with the target.
heat_fig, heat_ax = plt.subplots(figsize=(10, 10))
projected_corr = Proj_data_Train.corr()
sns.heatmap(projected_corr, annot=True, linewidths=1, ax=heat_ax)
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fbcd5d6208>
In [ ]:
 

SVM

In [47]:
from sklearn.svm import SVC

# Baseline: support-vector classifier with default hyper-parameters, trained
# on the PCA-projected training data.
svc = SVC()
svc.fit(X=X_train_new, y=y_train)
In [ ]:
# Score of the baseline classifier on the projected test split.
svc.score(X=X_test_new, y=y_test)
In [ ]:
import multiprocessing
from sklearn.model_selection import GridSearchCV

# Grid search over kernel type and C with 10-fold cross-validation, scored by
# accuracy, using as many parallel jobs as multiprocessing reports CPUs.
param_grid = [{'kernel': ['linear', 'rbf'], 'C': [0.01, 0.05, 0.5, 1]}]
worker_count = multiprocessing.cpu_count()
g_s = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=worker_count,
)
In [ ]:
# Run the search on the projected training data, then show the best estimator found.
g_s.fit(X=X_train_new, y=y_train)
g_s.best_estimator_
In [ ]:
# Best score found by the grid search (scoring='accuracy' was requested when g_s was built).
g_s.best_score_
In [ ]:
# The grid search selects C=1 with the RBF kernel as the best-scoring configuration.
In [ ]:
# Refit with C=1 and the RBF kernel, then report accuracy on both splits.
svc = SVC(C=1, kernel='rbf')
svc.fit(X_train_new, y_train)

# Score the PCA-projected arrays (X_train_new / X_test_new). The names
# X_train_pca / X_test_pca used here before are not defined anywhere in the
# visible notebook, so the cell raised NameError.
print("Accuracy on training set:", svc.score(X_train_new, y_train))
print("Accuracy on test set:", svc.score(X_test_new, y_test))
In [ ]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tuned classifier on the training split;
# the mean of the fold scores is reported.
scoresTrain = cross_val_score(estimator=svc, X=X_train_new, y=y_train, cv=10)
train_cv_mean = scoresTrain.mean()
print("Train Accuracy:", train_cv_mean)
In [ ]:
# Held-out accuracy: the classifier fitted on the training split is scored once
# on the test split. Cross-validating on (X_test_new, y_test) would instead
# train fresh models on folds of the test data, which is not a test-set
# evaluation of the fitted model.
scoresTest = svc.score(X_test_new, y_test)
print("Test Accuracy:", scoresTest)